## Load the libraries
library(tidytuesdayR)
library(ggplot2)
library(plotly)
library(dplyr)
library(tidyr)
library(corrplot)
library(corrgram)
library(usmap)
library(moments)
library(animation)
## Pull the datasets
tuition_cost <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-10/tuition_cost.csv')
tuition_income <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-10/tuition_income.csv')
salary_potential <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-10/salary_potential.csv')
historical_tuition <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-10/historical_tuition.csv')
diversity_school <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-10/diversity_school.csv')
#Modify year
historical_tuition$yearmod <- sapply(strsplit(historical_tuition$year, split='-', fixed=TRUE), function(x) (x[1]))
#Create a subset for All Institutions
historical_tuitionALL <- subset(historical_tuition, historical_tuition$type == "All Institutions")
#Create subsets for all, 2, and 4 year institutions
histall <- subset(historical_tuitionALL, historical_tuitionALL$tuition_type == "All Constant")
hist2year <- subset(historical_tuitionALL, historical_tuitionALL$tuition_type == "2 Year Constant")
hist4year <- subset(historical_tuitionALL, historical_tuitionALL$tuition_type == "4 Year Constant")
## Plot tutition cost over time for all institutions
p <- ggplot(histall, aes(yearmod, tuition_cost)) +
labs(x = "Year", y="Tuition Cost", title = "All Institution Cost over Time") +
theme(axis.text.x = element_text(angle = 90))+
geom_col()
ggplotly(p)
## Plot tutition cost over time for 2 year institutions
p <- ggplot(hist2year, aes(yearmod, tuition_cost)) +
labs(x = "Year", y="Tuition Cost", title = "2 Year Institution Cost over Time") +
theme(axis.text.x = element_text(angle = 90))+
geom_col()
ggplotly(p)
## Plot tutition cost over time for 4 year institutions
p <- ggplot(hist4year, aes(yearmod, tuition_cost)) +
labs(x = "Year", y="Tuition Cost", title = "4 Year Institution Cost over Time") +
theme(axis.text.x = element_text(angle = 90))+
geom_col()
ggplotly(p)
#create line graph to compare the historical trend of tuition costs across different universities
hist_tuition <- historical_tuition %>%
filter(tuition_type == "All Constant" | tuition_type == "4 Year Constant" | tuition_type == "2 Year Constant") %>%
filter(type == "All Institutions")
ts_plot <- ggplot(data = hist_tuition, aes(x = year, group = tuition_type)) +
geom_line(aes(y = tuition_cost, color = tuition_type), size = 1) +
ggtitle("Historical Costs of College \n Adjusted for Inflation") +
theme(axis.text.x = element_text(angle = 90)) +
xlab("Year") +
ylab("College Tuition Cost") +
labs(color = "Type of Degree")
ggplotly(ts_plot)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
Tuition costs have continued to increase over the years, especially in for 4 year institutions, even when controlling for inflation. From 1985 to 2016 the cost of 2 year institutions has gone up $3,090 whereas the cost of 4 year institutions has risen $14,319. There’s a significant difference in the increase of price between the two.
## replace _ with ,
tuition_income$income_lvl <- replace(tuition_income$income_lvl,tuition_income$income_lvl=="48_001 to 75,000","48,001 to 75,000")
lowestincome <- subset(tuition_income, tuition_income$income_lvl == "0 to 30,000")
lowincome <- subset(tuition_income, tuition_income$income_lvl == "30,001 to 48,000")
midincome <- subset(tuition_income, tuition_income$income_lvl == "48,001 to 75,000")
highincome <- subset(tuition_income, tuition_income$income_lvl == "75,001 to 110,000")
highestincome <- subset(tuition_income, tuition_income$income_lvl == "Over 110,000")
#boxplot of income levels and net costs
p <- ggplot(tuition_income, aes(x = income_lvl, y = net_cost, fill = income_lvl)) +
geom_boxplot()+
labs(x = "Income Levels", y="Net Cost", title = "Income Levels and Net Costs Compared", fill = "Income Level")+
theme(axis.text.x = element_text(angle = 90))
ggplotly(p)
## Merge salary potential and tuition cost datasets by name of college
potential_salary <- merge(salary_potential,tuition_cost,by=c("name"))
#ggplot for tuition vs career pay
p <- ggplot(potential_salary, aes(x = in_state_tuition, y = mid_career_pay, fill = type,label = name)) +
geom_point()+
labs(x = "In State Tuition Cost", y="Estimated Mid Career Pay", title = "In State vs. Estimated Mid Career Pay for Types of Instituitions Costs Compared", fill = "Type of College")
ggplotly(p)
## Group in state tution into differnt levels
potential_salary <- potential_salary %>%
mutate(salary_group = case_when(
between(in_state_tuition, 0, 10000) ~ "0 to 10000",
between(in_state_tuition, 10000, 20000) ~ "10000 to 20000",
between(in_state_tuition, 20000, 30000) ~ "20000 to 30000",
between(in_state_tuition, 30000, 40000) ~ "30000 to 40000",
between(in_state_tuition, 40000, 50000) ~ "40000 to 50000",
between(in_state_tuition, 50000, 60000) ~ "50000 to 60000",
between(in_state_tuition, 60000, 70000) ~ "60000 to 70000",
TRUE ~ NA_character_
))
p <- ggplot(potential_salary, aes(x = salary_group, y = mid_career_pay, fill = type)) +
geom_boxplot()+
labs(x = "In State Tuition", y="Estimated Mid Career Pay", title = "In State Tuition and Mid Career Pay Compared", fill = "Type of College")+
theme(axis.text.x = element_text(angle = 45))
ggplotly(p)
cor(potential_salary$in_state_tuition,potential_salary$mid_career_pay)
## [1] 0.5134
cor(potential_salary$in_state_tuition,potential_salary$early_career_pay)
## [1] 0.4781204
## Group in STEM percent into differnt levels
potential_salary <- potential_salary %>%
mutate(stem_group = case_when(
between(stem_percent, 0, 10) ~ "0 to 10",
between(stem_percent, 10, 20) ~ "10 to 20",
between(stem_percent, 20, 30) ~ "20 to 30",
between(stem_percent, 30, 40) ~ "30 to 40",
between(stem_percent, 40, 50) ~ "40 to 50",
between(stem_percent, 50, 60) ~ "50 to 60",
between(stem_percent, 60, 70) ~ "60 to 70",
between(stem_percent, 70, 80) ~ "70 to 80",
between(stem_percent, 80, 90) ~ "80 to 90",
between(stem_percent, 90, 100) ~ "90 to 100",
TRUE ~ NA_character_
))
p <- ggplot(potential_salary, aes(x = stem_group, y = mid_career_pay, fill = type)) +
geom_boxplot()+
labs(x = "STEM Percent", y="Estimated Mid Career Pay", title = "STEM Percent and Mid Career Pay Compared", fill = "Type of School")+
theme(axis.text.x = element_text(angle = 45))
ggplotly(p)
## Group in STEM percent into differnt levels
potential_salary <- potential_salary %>%
mutate(makebetter_group = case_when(
between(make_world_better_percent, 0, 10) ~ "0 to 10",
between(make_world_better_percent, 10, 20) ~ "10 to 20",
between(make_world_better_percent, 20, 30) ~ "20 to 30",
between(make_world_better_percent, 30, 40) ~ "30 to 40",
between(make_world_better_percent, 40, 50) ~ "40 to 50",
between(make_world_better_percent, 50, 60) ~ "50 to 60",
between(make_world_better_percent, 60, 70) ~ "60 to 70",
between(make_world_better_percent, 70, 80) ~ "70 to 80",
between(make_world_better_percent, 80, 90) ~ "80 to 90",
between(make_world_better_percent, 90, 100) ~ "90 to 100",
TRUE ~ "Not Reported"
))
p <- ggplot(potential_salary, aes(x = makebetter_group, y = mid_career_pay, fill = type)) +
geom_boxplot()+
labs(x = "Feel Make World Better Percent", y="Estimated Mid Career Pay", title = "Make World Better Percent and Mid Career Pay Compared")+
theme(axis.text.x = element_text(angle = 45))
ggplotly(p)
#read in more college data
library(readr)
College_Data <- read_csv("College_Data.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## UNITID = col_double(),
## OPEID = col_double(),
## OPEID6 = col_double(),
## HCM2 = col_double(),
## MAIN = col_double(),
## PREDDEG = col_double(),
## HIGHDEG = col_double(),
## CONTROL = col_double(),
## ST_FIPS = col_double(),
## REGION = col_double(),
## LOCALE = col_double(),
## LATITUDE = col_double(),
## LONGITUDE = col_double(),
## CCBASIC = col_double(),
## CCUGPROF = col_double(),
## CCSIZSET = col_double(),
## DISTANCEONLY = col_double(),
## ICLEVEL = col_double()
## )
## See spec(...) for full column specifications.
## Warning: 7 parsing failures.
## row col expected actual file
## 5345 LOCALE a double NULL 'College_Data.csv'
## 5345 LATITUDE a double NULL 'College_Data.csv'
## 5345 LONGITUDE a double NULL 'College_Data.csv'
## 5345 CCBASIC a double NULL 'College_Data.csv'
## 5345 CCUGPROF a double NULL 'College_Data.csv'
## .... ......... ........ ...... ..................
## See problems(...) for more details.
#data wrangling
#select variables
college_df <- College_Data %>%
select(INSTNM, ADM_RATE, ACTCMMID, SAT_AVG, INEXPFTE, C150_4,ICLEVEL, MD_EARN_WNE_P6, DEBT_MDN, GRAD_DEBT_MDN)
college_df$ADM_RATE <- as.double(college_df$ADM_RATE)
## Warning: NAs introduced by coercion
college_df$ACTCMMID <- as.double(college_df$ACTCMMID)
## Warning: NAs introduced by coercion
college_df$SAT_AVG <- as.double(college_df$SAT_AVG)
## Warning: NAs introduced by coercion
college_df$INEXPFTE <- as.double(college_df$INEXPFTE)
## Warning: NAs introduced by coercion
college_df$C150_4 <- as.double(college_df$C150_4)
## Warning: NAs introduced by coercion
college_df$MD_EARN_WNE_P6 <- as.double(college_df$MD_EARN_WNE_P6)
## Warning: NAs introduced by coercion
college_df$DEBT_MDN <- as.double(college_df$DEBT_MDN)
## Warning: NAs introduced by coercion
college_df$GRAD_DEBT_MDN <- as.double(college_df$GRAD_DEBT_MDN)
## Warning: NAs introduced by coercion
#update name of college for joining
colnames(college_df)[colnames(college_df) == "INSTNM"] <- "name"
#select variables
col1 <- tuition_cost %>%
select(c(name, room_and_board, in_state_tuition, out_of_state_tuition))
col2<- salary_potential %>%
select(c(mid_career_pay, make_world_better_percent, stem_percent, name))
#joing datasets
col <- inner_join(col1, col2, by = "name")
college_factors <- inner_join(col, college_df, by = "name")
#view summary of data
summary(college_factors)
## name room_and_board in_state_tuition out_of_state_tuition
## Length:716 Min. : 1698 Min. : 4118 Min. : 4118
## Class :character 1st Qu.: 8948 1st Qu.:10021 1st Qu.:20754
## Mode :character Median :10550 Median :26000 Median :29188
## Mean :10924 Mean :26597 Mean :30862
## 3rd Qu.:12880 3rd Qu.:40708 3rd Qu.:41038
## Max. :18156 Max. :58230 Max. :58230
## NA's :35
## mid_career_pay make_world_better_percent stem_percent ADM_RATE
## Min. : 60100 Min. :33.00 Min. : 0.00 Min. :0.0436
## 1st Qu.: 81100 1st Qu.:48.00 1st Qu.: 7.00 1st Qu.:0.5458
## Median : 89100 Median :52.00 Median : 12.50 Median :0.6895
## Mean : 92114 Mean :53.48 Mean : 16.55 Mean :0.6527
## 3rd Qu.:100300 3rd Qu.:58.00 3rd Qu.: 22.00 3rd Qu.:0.8114
## Max. :158200 Max. :94.00 Max. :100.00 Max. :1.0000
## NA's :28 NA's :58
## ACTCMMID SAT_AVG INEXPFTE C150_4 ICLEVEL
## Min. :17.00 Min. : 925 Min. : 0 Min. :0.0000 Min. :1
## 1st Qu.:22.00 1st Qu.:1092 1st Qu.: 7466 1st Qu.:0.4503 1st Qu.:1
## Median :24.00 Median :1155 Median : 9890 Median :0.5941 Median :1
## Mean :24.61 Mean :1180 Mean : 12814 Mean :0.5945 Mean :1
## 3rd Qu.:27.00 3rd Qu.:1239 3rd Qu.: 14158 3rd Qu.:0.7320 3rd Qu.:1
## Max. :36.00 Max. :1566 Max. :108509 Max. :1.0000 Max. :1
## NA's :148 NA's :143 NA's :14
## MD_EARN_WNE_P6 DEBT_MDN GRAD_DEBT_MDN
## Min. : 17400 Min. : 4563 Min. : 5600
## 1st Qu.: 31200 1st Qu.:14000 1st Qu.:20711
## Median : 35650 Median :16750 Median :23750
## Mean : 37775 Mean :16874 Mean :23078
## 3rd Qu.: 42000 3rd Qu.:19500 3rd Qu.:26256
## Max. :112100 Max. :29500 Max. :40000
## NA's :10 NA's :4 NA's :5
#take out unnesseary values
college_pc <- college_factors %>%
select(-c(name, ICLEVEL))
#remove NA and replace with 0s
college_pc[is.na(college_pc)] <- 0
principle_components <- prcomp(college_pc, scale. = T)
#look at principle compenents
principle_components$rotation[1:5,1:4]
## PC1 PC2 PC3 PC4
## room_and_board 0.31734579 -0.14009048 0.06844336 0.1722346
## in_state_tuition 0.33778060 -0.02410867 0.29070941 0.1835468
## out_of_state_tuition 0.37625596 -0.01076422 0.21450991 0.1581642
## mid_career_pay 0.36371271 0.17353522 -0.01947342 -0.1821042
## make_world_better_percent -0.08375642 0.02192628 -0.14910722 -0.6563008
std_dev <- principle_components$sdev
pr_var <- std_dev^2
prop_varex <- pr_var/sum(pr_var)
#plot changing variance
plot(cumsum(prop_varex), xlab = "Principle Component", ylab = "Cummulative Proportion of Variance Explained", type = "b")
abline(h = .95, col = "grey")
sum(prop_varex[1:9])
## [1] 0.9533869
With 9 principle components, 95% of the variance is explained.
#get principle components
college_pc <- cbind(college_pc, principle_components$x)
college_pc <- as.data.frame(college_pc[, 15:23])
head(college_pc)
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## 1 -2.3852394 -0.1951275 -1.3344202 0.2938433 -0.4548928 0.4556749 -0.5327095
## 2 0.2197772 -0.5067813 2.9680314 -0.3789071 -0.4780764 1.3683271 0.4793397
## 3 -2.6619101 -2.1818263 -0.2248277 -0.9136863 0.2041236 -0.2415222 0.8206367
## 4 -2.1079738 0.6094843 1.2866421 -0.8744944 -0.6563394 1.2337956 -0.9281638
## 5 3.7381933 -0.4000781 0.2602222 -5.2904344 -0.3734207 -1.6273135 -3.9086511
## 6 0.3902363 -1.6371283 1.0612125 -0.6332616 -0.9723017 -0.4513086 -0.2816824
## PC8 PC9
## 1 -0.8361891 0.2502482
## 2 -1.0519618 -0.3316535
## 3 -1.4647345 -0.2131491
## 4 -0.5627559 -0.7413327
## 5 1.7037832 0.1626732
## 6 0.6677532 0.3050402
set.seed(2345)
#create formula
kmean_withinss <- function(k) {
cluster <- kmeans(college_pc, k)
return (cluster$tot.withinss)
}
#look at best number of clusters
max_k <- 20
wss <- sapply(2:max_k, kmean_withinss)
elbow <- data.frame(2:max_k, wss)
ggplot(elbow, aes(x = X2.max_k, y = wss)) +
geom_point() +
geom_line() +
scale_x_continuous(breaks = seq(1, 20, by = 1))
We will choose 8 as the optimal cluster.
#examine clusters
pc_cluster <- kmeans(college_pc, 8)
pc_cluster$cluster
## [1] 3 4 3 5 8 8 8 3 7 8 6 1 7 8 3 7 5 5 6 8 3 8 3 8 3 3 1 3 5 1 3 3 7 4 8 5 8
## [38] 6 6 3 3 8 8 6 3 4 8 3 8 3 3 3 3 1 1 4 5 8 8 3 3 3 1 4 1 8 1 6 3 3 8 8 1 1
## [75] 8 8 3 8 1 3 3 8 7 7 8 3 3 3 8 6 3 8 7 3 6 8 8 1 3 6 4 8 3 8 3 8 3 3 8 1 4
## [112] 1 8 3 5 4 5 4 3 1 3 3 3 8 3 1 3 8 1 4 8 8 3 3 3 1 1 3 3 8 4 8 4 3 5 8 4 8
## [149] 3 1 8 3 3 3 5 3 3 3 3 8 8 8 1 3 3 4 3 3 8 3 3 3 3 3 8 3 3 8 5 3 3 8 3 3 3
## [186] 3 8 8 3 3 3 4 6 7 5 8 4 3 6 3 1 3 4 1 8 8 5 8 3 1 3 1 3 3 8 7 8 8 7 4 8 3
## [223] 3 6 8 3 8 3 3 3 3 3 8 8 3 8 1 3 4 4 5 3 3 3 3 3 3 3 1 7 8 4 4 1 4 3 2 3 3
## [260] 6 8 8 1 5 8 4 8 3 3 3 6 8 3 8 3 3 8 8 4 8 8 3 3 8 7 5 8 5 7 8 8 8 3 8 8 3
## [297] 3 1 8 3 8 3 3 8 2 8 8 4 7 3 6 3 8 8 6 3 3 3 1 3 3 6 3 8 8 8 3 3 3 3 3 8 6
## [334] 8 3 5 3 8 3 3 3 3 8 8 5 3 7 8 3 5 5 5 8 6 8 6 1 3 3 3 3 8 3 1 5 3 3 3 3 3
## [371] 3 3 3 6 3 6 3 1 4 8 8 3 3 8 8 8 3 4 3 3 3 6 3 3 3 2 3 8 3 8 7 8 6 6 3 7 3
## [408] 5 1 3 8 5 1 3 8 8 8 5 8 8 1 8 1 3 8 8 1 8 8 5 8 8 3 4 6 4 1 3 2 6 4 3 8 3
## [445] 8 8 8 8 8 8 4 8 8 5 4 8 3 1 8 8 8 8 8 3 3 3 8 4 3 8 3 3 3 3 5 3 1 6 5 3 3
## [482] 3 5 3 3 8 3 8 8 7 7 7 7 7 7 8 1 3 3 5 3 5 4 1 8 3 4 8 1 3 8 6 3 8 3 2 8 7
## [519] 5 6 7 3 3 8 3 8 1 8 3 3 1 3 8 7 3 3 5 3 3 3 3 6 3 2 3 8 3 3 3 3 3 1 8 3 8
## [556] 8 8 8 8 8 3 8 8 8 8 3 3 6 3 3 3 3 8 3 8 3 3 3 3 3 3 3 6 5 3 6 5 3 3 3 1 3
## [593] 3 3 8 2 8 8 3 3 8 3 3 3 3 3 3 3 1 8 1 6 8 4 8 1 1 8 8 8 3 8 3 3 3 1 3 3 3
## [630] 5 8 3 8 3 8 3 3 3 3 8 3 3 3 3 3 3 3 3 3 8 3 6 3 3 8 1 5 1 3 8 8 3 3 8 4 8
## [667] 8 6 3 1 4 3 6 3 1 6 3 1 8 3 4 3 3 3 6 3 3 5 6 3 3 8 6 5 3 1 8 3 8 4 3 8 3
## [704] 5 1 6 3 3 8 4 8 4 8 3 1 8
pc_cluster$centers
## PC1 PC2 PC3 PC4 PC5 PC6
## 1 4.827956 1.91841731 -0.428070580 0.24014651 -0.12111590 -0.25645429
## 2 -1.999867 4.71555786 1.304853532 -3.72829083 -0.89853133 -1.92865825
## 3 -1.187859 -0.36236013 -0.933205083 -0.03641196 -0.11016925 0.01717884
## 4 1.077969 0.09477732 3.152451490 0.12882713 -0.19064639 1.21091244
## 5 -1.969200 0.46210438 1.618518263 -0.14620149 -0.31774188 1.13539565
## 6 -3.689187 2.52431459 1.286423221 -0.30319942 0.19619910 -0.40404993
## 7 -1.197118 -0.02780196 1.327315833 2.52929662 2.40137745 -1.35098259
## 8 1.484192 -0.89339424 0.001398552 -0.15297638 -0.01482445 -0.11644834
## PC7 PC8 PC9
## 1 0.323219952 0.01379208 0.17753983
## 2 -1.076216470 -1.51907549 -0.41228215
## 3 -0.009033058 -0.05327904 0.08977257
## 4 -0.434501675 -0.16100360 -0.27624690
## 5 -0.343626465 -0.06744575 0.11803749
## 6 0.624902190 0.59072998 0.04522226
## 7 -0.426386456 -0.44736827 0.02014949
## 8 0.038064978 0.10931511 -0.15351054
pc_cluster$size
## [1] 62 7 296 43 42 43 26 197
#correlation matix
center <- pc_cluster$centers
cluster <- c(1:8)
centerdf <- data.frame(cluster, center)
head(centerdf)
## cluster PC1 PC2 PC3 PC4 PC5 PC6
## 1 1 4.827956 1.91841731 -0.4280706 0.24014651 -0.1211159 -0.25645429
## 2 2 -1.999867 4.71555786 1.3048535 -3.72829083 -0.8985313 -1.92865825
## 3 3 -1.187859 -0.36236013 -0.9332051 -0.03641196 -0.1101693 0.01717884
## 4 4 1.077969 0.09477732 3.1524515 0.12882713 -0.1906464 1.21091244
## 5 5 -1.969200 0.46210438 1.6185183 -0.14620149 -0.3177419 1.13539565
## 6 6 -3.689187 2.52431459 1.2864232 -0.30319942 0.1961991 -0.40404993
## PC7 PC8 PC9
## 1 0.323219952 0.01379208 0.17753983
## 2 -1.076216470 -1.51907549 -0.41228215
## 3 -0.009033058 -0.05327904 0.08977257
## 4 -0.434501675 -0.16100360 -0.27624690
## 5 -0.343626465 -0.06744575 0.11803749
## 6 0.624902190 0.59072998 0.04522226
center_reshape <- gather(centerdf, features, values, PC1:PC9)
ggplot(data = center_reshape, aes(x = features, y = cluster, fill = values)) +
scale_y_continuous(breaks = seq(1, 7, by = 1)) +
geom_tile() +
coord_equal() +
theme_classic()